#### Paper title: Introducing the MAVERICK dataset
#### Purpose: Regression analysis and robustness tests
#### Author: Sebastian van Baalen & Kristine Höglund
#### Last updated: 2025-10-14

#### Front matter ####

# Install packages

# Uncomment to install packages

# install.packages("here") # To use relative paths
# install.packages("devtools") # To install packages from Github
# install.packages("MASS") " To use negative binomial regression
# install.packages("eventreport") # To load and aggregate the MAVERICK data
# install.packages("tidyverse") # To work with tidy data
# install.packages("sf") # To work with spatial data
# install.packages("lmtest") # To calculate robust clustered standard errors
# install.packages("sandwich") # To calculate robust clustered standard errors
# install.packages("stargazer") # To create regression tables
# install.packages("marginaleffects") # To calculate marginal effects
# install.packages("ggplot2") # To visualize data
# install.packages("wesanderson") # To use the Wes Anderson color palettes
# install.packages("car") # To conduct hypothesis testing
# install.packages("scales") # To adapt plot scales

# Install the vdemdata package from Github

# devtools::install_github("vdeminstitute/vdemdata") # To load the V-Dem data

# Install the eventreport package from Github

# devtools::install_github("sebastianvanbaalen/eventreport") # To load and aggregate the MAVERICK data

# Load packages

suppressWarnings(suppressMessages(suppressPackageStartupMessages({
  library(here) # To use relative paths
  library(MASS) # To use negative binomial regression
  library(eventreport) # To load and aggregate the MAVERICK data
  library(tidyverse) # To work with tidy data
  library(sf) # To work with spatial data
  library(lmtest) # To calculate robust clustered standard errors
  library(sandwich) # To calculate robust clustered standard errors
  library(stargazer) # To create regression tables
  library(marginaleffects) # To calculate marginal effects
  library(scales) # To adapt plot scales
  library(fixest) # For calculating Conley standard errors 
  library(ggplot2) # To visualize data
  library(wesanderson) # To use the Wes Anderson color palettes
  library(car) # To conduct hypothesis testing
})))

# Load MAVERICK data using the eventreport package

maverick_inf <- aggregate_maverick_inf()

maverick_rep <- aggregate_maverick_rep()

maverick_con <- aggregate_maverick_con()

maverick <- maverick_event_report

# Load shapefiles

# Available at: https://data.humdata.org/dataset/geoboundaries-admin-boundaries-for-cote-d-ivoire

shapefile_civ <- st_read('civ_admbnda_adm1_cntig_ocha_itos_20180706/civ_admbnda_adm1_cntig_ocha_itos_20180706.shp')

# Available at: https://data.humdata.org/dataset/geoboundaries-admin-boundaries-for-kenya

shapefile_ke <- st_read('geoBoundaries-KEN-ADM1-all/geoBoundaries-KEN-ADM1.shp')

# Define custom plot theme

my_theme <- theme_bw() +
  theme(
    plot.margin = margin(1, 1, 1, 1, "cm"),
    axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size = 20),
    axis.title.y.right = element_text(margin = margin(t = 0, r = 0, b = 0, l = 10), size = 20),
    axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size = 20),
    axis.text.x = element_text(size = 18),
    axis.text.y = element_text(size = 18),
    legend.position = "bottom",
    legend.title = element_text(size = 18),
    legend.text = element_text(size = 18),
    strip.text = element_text(size = 18),
    strip.background = element_rect(fill = "white", color = "black"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0, margin = margin (t = 0, r = 0, b = 10, l = 0)),
    plot.subtitle = element_text(size = 14, hjust = 0, margin = margin (t = 0, r = 0, b = 40, l = 0)),
    plot.caption = element_text(size = 16, hjust = 1, margin = margin (t = 20, r = 0, b = 0, l = 0)),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  )

#### Prepare main dataset ####

# Make dataset spatial

maverick_sf <- maverick_rep %>%
  filter(geo_precision > 2 & !is.na(latitude) & !is.na(longitude) & latitude != 0 & longitude != 0) %>% 
  mutate(year = year(as.Date(date_start, format = "%Y-%m-%d"))) %>% 
  mutate(country_year = paste(country, year, sep = "_")) %>% 
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326, remove = FALSE)

# Add district names from shapefiles

maverick_sf <- maverick_sf %>%
  st_join(shapefile_civ %>% select(ADM1_FR), left = TRUE) %>% 
  st_join(shapefile_ke %>% select(shapeName), left = TRUE) %>% 
  mutate(
    district = case_when(
      country == "Ivory Coast" ~ ADM1_FR,
      TRUE ~ shapeName
    )
  ) %>%
  filter(!is.na(district)) %>% 
  select(-ADM1_FR, -shapeName)

# Load PRIO-Grid

prio_grid <- read.csv("PRIO-GRID Static Variables - 2024-06-14.csv")

# Load PRIO-Grid shapefile

shapefile_prio_grid <- st_read("priogrid_cellshp/priogrid_cell.shp")

# Join PRIO-Grid shapefile with PRIO-Grid dataset

shapefile_prio_grid <- shapefile_prio_grid %>% 
  left_join(prio_grid, by = "gid")

# Join dataset with PRIO-Grid dataset

maverick_sf <- maverick_sf %>% 
  st_join(shapefile_prio_grid)

# Create variables for analysis 

maverick_sf <- maverick_sf %>% 
  # Create security force involvement variable
  mutate(
    event_context = case_when(
      event_context == "Canvasing" ~ "Other context",
      event_context == "Incumbent campaign event" ~ "Campaign event",
      event_context == "Arrest or raid" ~ "Military operation",
      event_context == "Opposition campaign event" ~ "Campaign event",
      event_context == "Riot" ~ "Protest or riot",
      event_context == "Protest" ~ "Protest or riot",
      event_context == "Security force crack-down on protest" ~ "Protest or riot",
      event_context == "Vote counting" ~ "Voting procedures",
      event_context == "Voting" ~ "Voting procedures",
      event_context == "Indeterminate" ~ "Unknown",
      TRUE ~ event_context
    ),
    sf_involve = case_when(
      actor1_type == "Security forces" & actor1_bystander != 1 ~ 1,
      actor2_type == "Security forces" & actor2_bystander != 1 ~ 1,
      actor3_type == "Security forces" & actor3_bystander != 1 ~ 1,
      actor4_type == "Security forces" & actor4_bystander != 1 ~ 1,
      actor5_type == "Security forces" & actor5_bystander != 1 ~ 1,
      actor6_type == "Security forces" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    police_involve = case_when(
      actor1_subtype == "Security forces: Police" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Police" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Police" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Police" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Police" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Police" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    army_involve = case_when(
      actor1_subtype == "Security forces: Army" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Army" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Army" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Army" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Army" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Army" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    ppolice_involve = case_when(
      actor1_subtype == "Security forces: Paramilitary police" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Paramilitary police" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Paramilitary police" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Paramilitary police" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Paramilitary police" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Paramilitary police" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    )
  )

# Convert the variables to factors

maverick_sf$country <- as.factor(maverick_sf$country)

maverick_sf$event_context <- as.factor(maverick_sf$event_context)

maverick_sf$election <- as.factor(maverick_sf$election)

# Set the reference categories 

maverick_sf$country <- relevel(maverick_sf$country, ref = "Kenya")

maverick_sf$event_context <- relevel(maverick_sf$event_context, ref = "Other context")

maverick_sf$election <- relevel(maverick_sf$election, ref = "CI-Presidential-1995")

# Remove events in elections with fewer than 10 violent events or for which the election is unknown

maverick_sf <- maverick_sf %>% 
  group_by(election) %>% 
  mutate(count = n()) %>% 
  ungroup() %>% 
  filter(count >= 10 & election != "Other" & election != "Indeterminate") 

#### Table BII: Descriptive statistics ####

maverick_sf %>% 
  select(
    deaths_best,
    injuries_best,
    damage,
    sf_involve,
    police_involve,
    ppolice_involve,
    army_involve,
    agri_gc,
    ttime_mean,
    event_context,
    country
  ) %>%
  as.data.frame() %>% 
  stargazer(
    covariate.labels = c(
      "Best estimated number of deaths",
      "Best estimate number of injured people",
      "Material destruction",
      "Security force involvement",
      "Police involvement",
      "Paramilitary police involvement",
      "Army involvement",
      "Agricultural land cover",
      "Travel time to the nearest urban center"
    ),
    title = "Descriptive statistics",
    digits = 2,
    font.size = "footnotesize",
    omit.summary.stat = c("p25", "p75"),
    label = "tab:descriptive",
    table.placement = "t",
    median = TRUE,
    type = "text"
    # type = "latex",
    # out = "table_c2.tex"
  )

#### Table I: Main analysis ####

##### Model 1 ####

model1 <- glm.nb(deaths_best ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_sf, link = "log")

# Calculate standard errors clustered by district

coeftest(model1, vcov = vcovCL(model1, cluster = ~ district))

##### Model 2 ####

model2 <- glm.nb(deaths_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf, link = "log")

# Calculate standard errors clustered by district

coeftest(model2, vcov = vcovCL(model2, cluster = ~ district))

##### Model 3 ####

model3 <- glm.nb(injuries_best ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_sf, link = "log")

# Calculate standard errors clustered by district

coeftest(model3, vcov = vcovCL(model3, cluster = ~ district))

##### Model 4 ####

model4 <- glm.nb(injuries_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf, link = "log")

# Calculate standard errors clustered by district

coeftest(model4, vcov = vcovCL(model4, cluster = ~ district))

##### Model 5 ####

model5 <- glm(damage ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_sf, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model5, vcov = vcovCL(model5, cluster = ~ district))

##### Model 6 ####

model6 <- glm(damage ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model6, vcov = vcovCL(model6, cluster = ~ district))

# Create lists of clustered standard errors

cluster_se1 <- sqrt(diag(vcovCL(model1, type = "HC", cluster = ~ district)))
cluster_se2 <- sqrt(diag(vcovCL(model2, type = "HC", cluster = ~ district)))
cluster_se3 <- sqrt(diag(vcovCL(model3, type = "HC", cluster = ~ district)))
cluster_se4 <- sqrt(diag(vcovCL(model4, type = "HC", cluster = ~ district)))
cluster_se5 <- sqrt(diag(vcovCL(model5, type = "HC", cluster = ~ district)))
cluster_se6 <- sqrt(diag(vcovCL(model6, type = "HC", cluster = ~ district)))

# Format additional lines

additional_lines <- list(
  c("Election fixed effects", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Aggregation", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}")
)

##### Make regression table ####

stargazer(
  model1,
  model2,
  model3,
  model4,
  model5,
  model6,
  digits = 2,
  single.row = FALSE,
  se = list(cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6),
  dep.var.labels = c("Lethal violence", "Injurious violence", "Material destruction"),
  model.names = FALSE,
  covariate.labels = c(
    "Security force involvement",
    "Police involvement",
    "Paramilitary police involvement",
    "Army involvement",
    "Agricultural land cover",
    "Travel time to nearest urban center"
  ),
  omit = c("election", "Constant"),
  title = "Regression of security force involvement and electoral violence",
  label = "tab:regression",
  table.placement = "t",
  add.lines = additional_lines,
  font.size = "scriptsize",
  notes.append = FALSE,
  star.char = c("+", "*", "**"),
  notes = c("Robust standard errors in parentheses, clustered by district/county:", "+ p $<$ 0.1, * p $<$ 0.05, ** p $<$ 0.01"),
  type = "text"
  # type = "latex",
  # out = "table_1.tex"
)

##### Test that the difference in coefficients is statistically significant ####

vcov_cluster <- vcovCL(model2, cluster = ~district)

# Test if police_involve = ppolice_involve

linearHypothesis(model2, "police_involve = ppolice_involve", vcov. = vcov_cluster)

# Test if police_involve = army_involve

linearHypothesis(model2, "police_involve = army_involve", vcov. = vcov_cluster)

##### Predicted values ####

# Predicted reduction in deaths with police involvement

predict_model2 <- predictions(
  model2, 
  newdata = datagrid(
    police_involve = c(0,1)
  ),
  by = "police_involve",
  vcov = sandwich::vcovCL(model2, cluster = ~ district)
)

predicted_value1 <- predict_model2 %>% 
  as.data.frame() %>% 
  filter(police_involve == 0) %>% 
  mutate(estimate = round(estimate, 2)) %>% 
  pull(estimate)

predicted_value2 <- predict_model2 %>% 
  as.data.frame() %>% 
  filter(police_involve == 1) %>% 
  mutate(estimate = round(estimate, 2)) %>% 
  pull(estimate)

predicted_value1

predicted_value2

round((predicted_value2-predicted_value1)/predicted_value1 * -100, 0)

round(mean(maverick_sf$deaths_best), 2)

median(maverick_sf$deaths_best)

# Predicted increase in deaths with paramilitary police involvement

predict_model2 <- predictions(
  model2,
  newdata = datagrid(
    ppolice_involve = c(0,1)
  ),
  by = "ppolice_involve",
  vcov = sandwich::vcovCL(model4, cluster = ~ country_year)
)

predicted_value3 <- predict_model2 %>%
  as.data.frame() %>%
  filter(ppolice_involve == 0) %>%
  mutate(estimate = round(estimate, 2)) %>%
  pull(estimate)

predicted_value4 <- predict_model2 %>%
  as.data.frame() %>%
  filter(ppolice_involve == 1) %>%
  mutate(estimate = round(estimate, 2)) %>%
  pull(estimate)

predicted_value3

predicted_value4

round((predicted_value4-predicted_value3)/predicted_value3 * 100, 0)

# Predicted reduction in the risk of material destruction 

predict_model6 <- predictions(
  model6, 
  newdata = datagrid(
    police_involve = c(0,1)
  ),
  by = "police_involve",
  vcov = sandwich::vcovCL(model6, cluster = ~ district)
)

predict_model6 %>% 
  as.data.frame() %>% 
  filter(police_involve == 0) %>% 
  mutate(estimate = round(estimate * 100, 0)) %>% 
  pull(estimate)

predict_model6 %>% 
  as.data.frame() %>% 
  filter(police_involve == 1) %>% 
  mutate(estimate = round(estimate * 100, 0)) %>% 
  pull(estimate)

##### Clean working environment ####

rm(
  model1, model2, model3, model4, model5, model6,
  cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6,
  additional_lines, predict_model2, predict_model6,
  predicted_value1, predicted_value2, predicted_value3, 
  predicted_value4, vcov_cluster
)

#### Accounting for aggregation sensitivity ####

##### Prepare most-informative dataset ####

# Make dataset spatial

maverick_sf_inf <- maverick_inf %>%
  filter(geo_precision > 2 & !is.na(latitude) & !is.na(longitude) & latitude != 0 & longitude != 0) %>% 
  mutate(year = year(as.Date(date_start, format = "%Y-%m-%d"))) %>% 
  mutate(country_year = paste(country, year, sep = "_")) %>% 
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326, remove = FALSE)

# Add district names from shapefiles

maverick_sf_inf <- maverick_sf_inf %>%
  st_join(shapefile_civ %>% select(ADM1_FR), left = TRUE) %>% 
  st_join(shapefile_ke %>% select(shapeName), left = TRUE) %>% 
  mutate(
    district = case_when(
      country == "Ivory Coast" ~ ADM1_FR,
      TRUE ~ shapeName
    )
  ) %>%
  filter(!is.na(district)) %>% 
  select(-ADM1_FR, -shapeName)

# Join dataset with PRIO-Grid dataset

maverick_sf_inf <- maverick_sf_inf %>% 
  st_join(shapefile_prio_grid)

# Create variables for analysis 

maverick_sf_inf <- maverick_sf_inf %>% 
  # Create security force involvement variable
  mutate(
    sf_involve = case_when(
      actor1_type == "Security forces" & actor1_bystander != 1 ~ 1,
      actor2_type == "Security forces" & actor2_bystander != 1 ~ 1,
      actor3_type == "Security forces" & actor3_bystander != 1 ~ 1,
      actor4_type == "Security forces" & actor4_bystander != 1 ~ 1,
      actor5_type == "Security forces" & actor5_bystander != 1 ~ 1,
      actor6_type == "Security forces" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    police_involve = case_when(
      actor1_subtype == "Security forces: Police" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Police" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Police" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Police" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Police" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Police" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    army_involve = case_when(
      actor1_subtype == "Security forces: Army" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Army" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Army" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Army" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Army" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Army" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    ppolice_involve = case_when(
      actor1_subtype == "Security forces: Paramilitary police" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Paramilitary police" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Paramilitary police" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Paramilitary police" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Paramilitary police" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Paramilitary police" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    )
  )

# Convert the variables to factors

maverick_sf_inf$election <- as.factor(maverick_sf_inf$election)

# Set the reference categories 

maverick_sf_inf$election <- relevel(maverick_sf_inf$election, ref = "CI-Presidential-1995")

##### Prepare most-conservative dataset ####

# Make dataset spatial

maverick_sf_con <- maverick_con %>%
  filter(geo_precision > 2 & !is.na(latitude) & !is.na(longitude) & latitude != 0 & longitude != 0) %>% 
  mutate(year = year(as.Date(date_start, format = "%Y-%m-%d"))) %>% 
  mutate(country_year = paste(country, year, sep = "_")) %>% 
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326, remove = FALSE)

# Add district names from shapefiles

maverick_sf_con <- maverick_sf_con %>%
  st_join(shapefile_civ %>% select(ADM1_FR), left = TRUE) %>% 
  st_join(shapefile_ke %>% select(shapeName), left = TRUE) %>% 
  mutate(
    district = case_when(
      country == "Ivory Coast" ~ ADM1_FR,
      TRUE ~ shapeName
    )
  ) %>%
  filter(!is.na(district)) %>% 
  select(-ADM1_FR, -shapeName)

# Join dataset with PRIO-Grid dataset

maverick_sf_con <- maverick_sf_con %>% 
  st_join(shapefile_prio_grid)

# Create variables for analysis 

maverick_sf_con <- maverick_sf_con %>% 
  # Create security force involvement variable
  mutate(
    sf_involve = case_when(
      actor1_type == "Security forces" & actor1_bystander != 1 ~ 1,
      actor2_type == "Security forces" & actor2_bystander != 1 ~ 1,
      actor3_type == "Security forces" & actor3_bystander != 1 ~ 1,
      actor4_type == "Security forces" & actor4_bystander != 1 ~ 1,
      actor5_type == "Security forces" & actor5_bystander != 1 ~ 1,
      actor6_type == "Security forces" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    police_involve = case_when(
      actor1_subtype == "Security forces: Police" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Police" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Police" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Police" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Police" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Police" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    army_involve = case_when(
      actor1_subtype == "Security forces: Army" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Army" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Army" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Army" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Army" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Army" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    ppolice_involve = case_when(
      actor1_subtype == "Security forces: Paramilitary police" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Paramilitary police" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Paramilitary police" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Paramilitary police" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Paramilitary police" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Paramilitary police" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    )
  )

# Convert the variables to factors

maverick_sf_con$election <- as.factor(maverick_sf_con$election)

# Set the reference categories 

maverick_sf_con$election <- relevel(maverick_sf_con$election, ref = "CI-Presidential-1995")

##### Model 7 ####

model7 <- glm.nb(deaths_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf_inf, link = "log")

# Calculate standard errors clustered by district

coeftest(model7, vcov = vcovCL(model7, cluster = ~ district))

##### Model 8 ####

model8 <- glm.nb(deaths_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf_con, link = "log")

# Calculate standard errors clustered by district

coeftest(model8, vcov = vcovCL(model8, cluster = ~ district))

##### Model 9 ####

model9 <- glm.nb(injuries_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf_inf, link = "log")

# Calculate standard errors clustered by district

coeftest(model9, vcov = vcovCL(model9, cluster = ~ district))

##### Model 10 ####

model10 <- glm.nb(injuries_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf_con, link = "log")

# Calculate standard errors clustered by district

coeftest(model10, vcov = vcovCL(model10, cluster = ~ district))

##### Model 11 ####

model11 <- glm(damage ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf_inf, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model11, vcov = vcovCL(model11, cluster = ~ district))

##### Model 12 ####

model12 <- glm(damage ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf_con, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model12, vcov = vcovCL(model12, cluster = ~ district))

# Create lists of clustered standard errors

cluster_se7 <- sqrt(diag(vcovCL(model7, type = "HC", cluster = ~ district)))
cluster_se8 <- sqrt(diag(vcovCL(model8, type = "HC", cluster = ~ district)))
cluster_se9 <- sqrt(diag(vcovCL(model9, type = "HC", cluster = ~ district)))
cluster_se10 <- sqrt(diag(vcovCL(model10, type = "HC", cluster = ~ district)))
cluster_se11 <- sqrt(diag(vcovCL(model11, type = "HC", cluster = ~ district)))
cluster_se12 <- sqrt(diag(vcovCL(model12, type = "HC", cluster = ~ district)))

# Format additional lines

additional_lines <- list(
  c("Election fixed effects", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Aggregation", "\\text{Most-inf.}", "\\text{Most-con.}", "\\text{Most-inf.}", "\\text{Most-con.}", "\\text{Most-inf.}", "\\text{Most-con.}")
)

##### Make regression table ####

stargazer(
  model7,
  model8,
  model9,
  model10,
  model11,
  model12,
  digits = 2,
  single.row = FALSE,
  se = list(cluster_se7, cluster_se8, cluster_se9, cluster_se10, cluster_se11, cluster_se12),
  dep.var.labels = c("Lethal violence", "Injurious violence", "Material destruction"),
  column.labels = c("(7)", "(8)", "(9)", "(10)", "(11)", "(12)"),
  model.names = FALSE,
  model.numbers = FALSE,
  covariate.labels = c(
    "Police involvement",
    "Paramilitary police involvement",
    "Army involvement",
    "Agricultural land cover",
    "Travel time to nearest urban center"
  ),
  omit = c("election", "Constant"),
  title = "Accounting for aggregation sensitivity",
  label = "tab:regression-aggregation",
  table.placement = "t",
  add.lines = additional_lines,
  font.size = "scriptsize",
  notes.append = FALSE,
  star.char = c("+", "*", "**"),
  notes = c("Robust standard errors in parentheses, clustered by district/county:", "+ p $<$ 0.1, * p $<$ 0.05, ** p $<$ 0.01"),
  type = "text"
  # type = "latex",
  # out = "table_d1.tex"
)

##### Clean working environment ####

rm(
  model7, model8, model9, model10, model11, model12,
  cluster_se7, cluster_se8, cluster_se9, cluster_se10, cluster_se11, cluster_se12,
  additional_lines, maverick_sf_con, maverick_sf_inf
)

#### Using inclusion certainty weights ####

# Create normalized certainty weights

maverick_sf$certain_norm <- maverick_sf$certain / mean(maverick_sf$certain)

summary(maverick_sf$certain_norm)

##### Model 13 ####

model13 <- glm.nb(deaths_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf, link = "log", weights = certain_norm, control = glm.control(maxit = 200))

# Calculate standard errors clustered by country

coeftest(model13, vcov = vcovCL(model13, cluster = ~ country))

##### Model 14 ####

model14 <- glm.nb(injuries_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf, link = "log", weights = certain_norm)

# Calculate standard errors clustered by country

coeftest(model14, vcov = vcovCL(model14, cluster = ~ country))

##### Model 15 ####

model15 <- glm(damage ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf, family = "binomial", weights = certain_norm)

# Calculate standard errors clustered by country

coeftest(model15, vcov = vcovCL(model15, cluster = ~ country))

# Create lists of clustered standard errors

cluster_se13 <- sqrt(diag(vcovCL(model13, type = "HC", cluster = ~ district)))
cluster_se14 <- sqrt(diag(vcovCL(model14, type = "HC", cluster = ~ district)))
cluster_se15 <- sqrt(diag(vcovCL(model15, type = "HC", cluster = ~ district)))

# Format additional lines

additional_lines <- list(
  c("Election fixed effects", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Inclusion certainty weights", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Aggregation", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}")
)

##### Make regression table ####

stargazer(
  model13,
  model14,
  model15,
  digits = 2,
  single.row = FALSE,
  se = list(cluster_se13, cluster_se14, cluster_se15),
  dep.var.labels = c("Lethal violence", "Injurious violence", "Material destruction"),
  column.labels = c("(13)", "(14)", "(15)"),
  model.names = FALSE,
  model.numbers = FALSE,
  covariate.labels = c(
    "Police involvement",
    "Paramilitary police involvement",
    "Army involvement",
    "Agricultural land cover",
    "Travel time to nearest urban center"
  ),
  omit = c("election", "Constant"),
  title = "Using inclusion certainty weights",
  label = "tab:regression-certainty",
  table.placement = "t",
  add.lines = additional_lines,
  font.size = "scriptsize",
  notes.append = FALSE,
  star.char = c("+", "*", "**"),
  notes = c("All models include inclusion certainty weights.", "Robust standard errors in parentheses, clustered by district/county:", "+ p $<$ 0.1, * p $<$ 0.05, ** p $<$ 0.01"),
  type = "text"
  # type = "latex",
  # out = "table_d2.tex"
)

##### Clean working environment ####

rm(
  model13, model14, model15,
  cluster_se13, cluster_se14, cluster_se15, 
  additional_lines
)

#### Omitting events with missing actor information ####

# Exclude events that include Unknown or Indeterminate actors in actor*_type

maverick_uncertain <- maverick_sf %>% 
  mutate(
    uncertain = case_when(
      actor1_type %in% c("", "Indeterminate") & actor1_bystander != 1 ~ 1,
      actor2_type %in% c("", "Indeterminate") & actor2_bystander != 1 ~ 1,
      actor3_type %in% c("", "Indeterminate") & actor3_bystander != 1 ~ 1,
      actor4_type %in% c("", "Indeterminate") & actor4_bystander != 1 ~ 1,
      actor5_type %in% c("", "Indeterminate") & actor5_bystander != 1 ~ 1,
      actor6_type %in% c("", "Indeterminate") & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    )
  ) %>% 
  filter(uncertain == 0)

# Check that the sub-setting worked

maverick_sf %>% 
  mutate(
    uncertain = case_when(
      actor1_type %in% c("", "Indeterminate") & actor1_bystander != 1 ~ 1,
      actor2_type %in% c("", "Indeterminate") & actor2_bystander != 1 ~ 1,
      actor3_type %in% c("", "Indeterminate") & actor3_bystander != 1 ~ 1,
      actor4_type %in% c("", "Indeterminate") & actor4_bystander != 1 ~ 1,
      actor5_type %in% c("", "Indeterminate") & actor5_bystander != 1 ~ 1,
      actor6_type %in% c("", "Indeterminate") & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    )
  ) %>% 
  filter(uncertain == 1) %>% 
  select(uncertain, actor1_type, actor2_type, actor3_type, actor4_type, actor5_type, actor6_type)

# Exclude events that include Unknown or Indeterminate actors in actor*_subtype

maverick_uncertain2 <- maverick_sf %>% 
  mutate(
    uncertain = case_when(
      actor1_subtype %in% c("", "Indeterminate") & actor1_bystander != 1 ~ 1,
      actor2_subtype %in% c("", "Indeterminate") & actor2_bystander != 1 ~ 1,
      actor3_subtype %in% c("", "Indeterminate") & actor3_bystander != 1 ~ 1,
      actor4_subtype %in% c("", "Indeterminate") & actor4_bystander != 1 ~ 1,
      actor5_subtype %in% c("", "Indeterminate") & actor5_bystander != 1 ~ 1,
      actor6_subtype %in% c("", "Indeterminate") & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    )
  ) %>% 
  filter(uncertain == 0)

# Check that the sub-setting worked

maverick_sf %>% 
  mutate(
    uncertain = case_when(
      actor1_subtype %in% c("", "Indeterminate") & actor1_bystander != 1 ~ 1,
      actor2_subtype %in% c("", "Indeterminate") & actor2_bystander != 1 ~ 1,
      actor3_subtype %in% c("", "Indeterminate") & actor3_bystander != 1 ~ 1,
      actor4_subtype %in% c("", "Indeterminate") & actor4_bystander != 1 ~ 1,
      actor5_subtype %in% c("", "Indeterminate") & actor5_bystander != 1 ~ 1,
      actor6_subtype %in% c("", "Indeterminate") & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    )
  ) %>% 
  filter(uncertain == 1) %>% 
  select(uncertain, actor1_subtype, actor2_subtype, actor3_subtype, actor4_subtype, actor5_subtype, actor6_subtype)

##### Model 16 ####

model1 <- glm.nb(deaths_best ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_uncertain, link = "log")

# Calculate standard errors clustered by district

coeftest(model1, vcov = vcovCL(model1, cluster = ~ district))

##### Model 17 ####

model2 <- glm.nb(deaths_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_uncertain2, link = "log")

# Calculate standard errors clustered by district

coeftest(model2, vcov = vcovCL(model2, cluster = ~ district))

##### Model 18 ####

model3 <- glm.nb(injuries_best ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_uncertain, link = "log")

# Calculate standard errors clustered by district

coeftest(model3, vcov = vcovCL(model3, cluster = ~ district))

##### Model 19 ####

model4 <- glm.nb(injuries_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_uncertain2, link = "log")

# Calculate standard errors clustered by district

coeftest(model4, vcov = vcovCL(model4, cluster = ~ district))

##### Model 20 ####

model5 <- glm(damage ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_uncertain, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model5, vcov = vcovCL(model5, cluster = ~ district))

##### Model 21 ####

model6 <- glm(damage ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_uncertain2, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model6, vcov = vcovCL(model6, cluster = ~ district))

# Create lists of clustered standard errors

cluster_se1 <- sqrt(diag(vcovCL(model1, type = "HC", cluster = ~ district)))
cluster_se2 <- sqrt(diag(vcovCL(model2, type = "HC", cluster = ~ district)))
cluster_se3 <- sqrt(diag(vcovCL(model3, type = "HC", cluster = ~ district)))
cluster_se4 <- sqrt(diag(vcovCL(model4, type = "HC", cluster = ~ district)))
cluster_se5 <- sqrt(diag(vcovCL(model5, type = "HC", cluster = ~ district)))
cluster_se6 <- sqrt(diag(vcovCL(model6, type = "HC", cluster = ~ district)))

# Format additional lines

additional_lines <- list(
  c("Election fixed effects", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Aggregation", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}")
)

##### Make regression table ####

stargazer(
  model1,
  model2,
  model3,
  model4,
  model5,
  model6,
  digits = 2,
  single.row = FALSE,
  se = list(cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6),
  dep.var.labels = c("Lethal violence", "Injurious violence", "Material destruction"),
  column.labels = c("(16)", "(17)", "(18)", "(19)", "(20)", "(21)"),
  model.names = FALSE,
  model.numbers = FALSE,
  covariate.labels = c(
    "Security force involvement",
    "Police involvement",
    "Paramilitary police involvement",
    "Army involvement",
    "Agricultural land cover",
    "Travel time to nearest urban center"
  ),
  omit = c("Constant", "election"),
  title = "Omitting events with missing actor information",
  label = "tab:regression-actors",
  table.placement = "t",
  add.lines = additional_lines,
  font.size = "scriptsize",
  notes.append = FALSE,
  star.char = c("+", "*", "**"),
  notes = c(
    "Models 16, 18, and 20 omit events that include at least one unknown actor type.",
    "Models 17, 19, and 21 omit events that include at least one unknown actor subtype.",
    "Robust standard errors in parentheses, clustered by district/county:", "+ p $<$ 0.1, * p $<$ 0.05, ** p $<$ 0.01"
  ),
  type = "text"
  # type = "latex",
  # out = "table_d3.tex"
)

##### Test that the difference in coefficients is statistically significant ####

vcov_cluster <- vcovCL(model2, cluster = ~district)

# Test if police_involve = ppolice_involve

linearHypothesis(model2, "police_involve = ppolice_involve", vcov. = vcov_cluster)

# Test if police_involve = army_involve

linearHypothesis(model2, "police_involve = army_involve", vcov. = vcov_cluster)

##### Clean working environment ####

rm(
  model1, model2, model3, model4, model5, model6,
  cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6, vcov_cluster,
  additional_lines, maverick_uncertain, maverick_uncertain2
)

#### Including only events sourced from news sources ####

# Subset event report data to only include sources from Factiva

maverick_media <- maverick %>% 
  filter(
    source_type %in% c(
      "International news agency",
      "International news media",
      "National news media"
    )
  )

maverick_reports <- maverick %>% 
  filter(
    source_type %in% c(
      "Human rights commission",
      "International civil society organization",
      "International organization",
      "National civil society organization",
      "National government agency",
      "Other",
      "Special commission"
    )
  )

# Count number of reports in the filtered dataset

writeLines(paste0(format(nrow(maverick_media), big.mark = ","), "%"), "/Users/sebastian/Library/CloudStorage/Dropbox/Appar/Overleaf/Introducing the MAVERICK Dataset/number_of_event_reports_media.tex")

# Re-specify the most-representative aggregation model

revised_aggregation <- function(data) {
  aggregateData(
    data = data,
    group_var = "event_id",
    find_mode = c(
      "country", "election", "city", "location",
      "actor1", "actor1_id", "actor1_type", "actor1_subtype", "actor1_party", "actor1_violence",
      "actor2", "actor2_id", "actor2_type", "actor2_subtype", "actor2_party", "actor2_violence",
      "actor3", "actor3_id", "actor3_type", "actor3_subtype", "actor3_party", "actor3_violence",
      "actor4", "actor4_id", "actor4_type", "actor4_subtype", "actor4_party", "actor4_violence",
      "actor5", "actor5_id", "actor5_type", "actor5_subtype", "actor5_party", "actor5_violence",
      "actor6", "actor6_id", "actor6_type", "actor6_subtype", "actor6_party", "actor6_violence",
      "event_context", "target"
    ),
    find_mode_date = c("date_start", "date_end"),
    find_mode_numeric = c(
      "latitude", "longitude", "geo_precision",
      "actor1_victim", "actor2_victim", "actor3_victim", "actor4_victim", "actor5_victim", "actor6_victim",
      "deaths_best", "deaths_low", "deaths_high", "injuries_best", "injuries_low", "injuries_high"
    ),
    find_mode_bin = c("displacement", "damage"),
    summarize_vars = c(
      "actor1_initiator", "actor1_perpetrator", "actor1_intervener", "actor1_bystander",
      "actor2_initiator", "actor2_perpetrator", "actor2_intervener", "actor2_bystander",
      "actor3_initiator", "actor3_perpetrator", "actor3_intervener", "actor3_bystander",
      "actor4_initiator", "actor4_perpetrator", "actor4_intervener", "actor4_bystander",
      "actor5_initiator", "actor5_perpetrator", "actor5_intervener", "actor5_bystander",
      "actor6_initiator", "actor6_perpetrator", "actor6_intervener", "actor6_bystander"
    ),
    combine_strings = "source",
    find_max = c("certain1", "certain2", "certain3", "certain4", "certain5", "certain6"),
    tie_break = "source_classification",
    second_tie_break = "certain",
    aggregation_name = "Representative"
  ) %>%
    mutate(certain = certain1 + certain2 + certain3 + certain4 + certain5 + certain6) %>%
    rowwise() %>%
    mutate(
      across(
        matches("^actor[1-6]_initiator$"),
        ~ {
          corresponding_actor <- get(gsub("_initiator", "", cur_column()))
          if (is.na(corresponding_actor)) {
            NA_integer_
          } else {
            max_value <- max(c_across(matches("^actor[1-6]_initiator$")), na.rm = TRUE)
            num_max <- sum(c_across(matches("^actor[1-6]_initiator$")) == max_value, na.rm = TRUE)
            ifelse(. == max_value & max_value > 0 & num_max == 1, 1, 0)
          }
        }
      ),
      across(
        matches("^actor[1-6]_perpetrator$"),
        ~ {
          corresponding_actor <- get(gsub("_perpetrator", "", cur_column()))
          if (is.na(corresponding_actor)) {
            NA_integer_
          } else {
            ifelse(. > 0 & get(gsub("_perpetrator", "_bystander", cur_column())) < 1, 1, 0)
          }
        }
      ),
      across(
        matches("^actor[1-6]_intervener$"),
        ~ {
          corresponding_actor <- get(gsub("_intervener", "", cur_column()))
          if (is.na(corresponding_actor)) {
            NA_integer_
          } else {
            ifelse(. > 0 & get(gsub("_intervener", "_initiator", cur_column())) == 0, 1, 0)
          }
        }
      ),
      across(
        matches("^actor[1-6]_bystander$"),
        ~ {
          corresponding_actor <- get(gsub("_bystander", "", cur_column()))
          if (is.na(corresponding_actor)) {
            NA_integer_
          } else {
            ifelse(
              . > 0 &
                get(gsub("_bystander", "_initiator", cur_column())) == 0 &
                get(gsub("_bystander", "_perpetrator", cur_column())) == 0 &
                get(gsub("_bystander", "_intervener", cur_column())) == 0, 1, 0
            )
          }
        }
      )
    ) %>%
    ungroup() %>%
    mutate(
      actor1_violence = case_when(actor1_violence == "NA" ~ NA_character_, TRUE ~ actor1_violence),
      actor2_violence = case_when(actor2_violence == "NA" ~ NA_character_, TRUE ~ actor2_violence),
      actor3_violence = case_when(actor3_violence == "NA" ~ NA_character_, TRUE ~ actor3_violence),
      actor4_violence = case_when(actor4_violence == "NA" ~ NA_character_, TRUE ~ actor4_violence),
      actor5_violence = case_when(actor5_violence == "NA" ~ NA_character_, TRUE ~ actor5_violence),
      actor6_violence = case_when(actor6_violence == "NA" ~ NA_character_, TRUE ~ actor6_violence)
    ) %>%
    select(
      event_id:election, date_start, date_end, city, location, latitude, longitude, geo_precision,
      actor1, actor1_id, actor1_type, actor1_subtype, actor1_party, actor1_violence, actor1_initiator, actor1_perpetrator, actor1_intervener, actor1_bystander, actor1_victim,
      actor2, actor2_id, actor2_type, actor2_subtype, actor2_party, actor2_violence, actor2_initiator, actor2_perpetrator, actor2_intervener, actor2_bystander, actor2_victim,
      actor3, actor3_id, actor3_type, actor3_subtype, actor3_party, actor3_violence, actor3_initiator, actor3_perpetrator, actor3_intervener, actor3_bystander, actor3_victim,
      actor4, actor4_id, actor4_type, actor4_subtype, actor4_party, actor4_violence, actor4_initiator, actor4_perpetrator, actor4_intervener, actor4_bystander, actor4_victim,
      actor5, actor5_id, actor5_type, actor5_subtype, actor5_party, actor5_violence, actor5_initiator, actor5_perpetrator, actor5_intervener, actor5_bystander, actor5_victim,
      actor6, actor6_id, actor6_type, actor6_subtype, actor6_party, actor6_violence, actor6_initiator, actor6_perpetrator, actor6_intervener, actor6_bystander, actor6_victim,
      event_context:damage, deaths_best:injuries_high,
      certain, certain1:certain6, source, number_of_sources:aggregation
    )
}

# Aggregate filtered event report data using the most-representative aggregation procedure

maverick_rep_media <- revised_aggregation(maverick_media)

maverick_rep_reports <- revised_aggregation(maverick_reports)

##### Figure D1: Share of actor records by actor subtype, country, and source mix ####

maverick_rep_media <- maverick_rep_media %>% 
  mutate(source_mix = "Media sources")

maverick_rep_reports <- maverick_rep_reports %>% 
  mutate(source_mix = "Reports")

maverick_rep_media %>% 
  mutate(source_mix = "Media sources") %>% 
  add_row(maverick_rep_reports) %>% 
  select(
    event_id, country, actor1_subtype, actor2_subtype, actor3_subtype,
    actor4_subtype, actor5_subtype, actor6_subtype, source_mix
  ) %>% 
  pivot_longer(
    cols = c(-event_id, -country, -source_mix),
    names_to = "actor",
    values_to = "actor_subtype"
  ) %>% 
  filter(!is.na(actor_subtype)) %>% 
  mutate(
    actor_subtype = case_when(
      actor_subtype == "Other" ~ "Other/Unknown",
      actor_subtype == "Indeterminate" ~ "Other/Unknown",
      actor_subtype == "" ~ "Other/Unknown",
      TRUE ~ actor_subtype
    )
  ) %>% 
  group_by(actor_subtype, country, source_mix) %>%
  summarize(count = n(), .groups = "drop") %>%
  group_by(country) %>%
  mutate(proportion = count / sum(count)) %>% 
  ungroup() %>%
  ggplot() +
  geom_linerange(
    aes(x = actor_subtype, ymin = 0, ymax = proportion, color = source_mix),
    position = position_dodge(width = 1)
  ) +
  geom_point(
    aes(x = actor_subtype, y = proportion, color = source_mix), 
    size = 3, position = position_dodge(width = 1)
  ) +
  facet_wrap(~ country) +
  geom_text(
    aes(
      x = actor_subtype, y = proportion + 0.02, color = source_mix,
      label = paste0(round(proportion*100,0), "%")
    ),
    size = 4, position = position_dodge(width = 1)
  ) +
  scale_y_continuous(labels = scales::percent_format(), breaks = seq(0, 0.4, 0.05)) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Share of actor records"
  ) +
  scale_color_manual(name = "Source mix", values = rev(wes_palette("Zissou1", 2, type = "continuous"))) +
  my_theme +
  theme(
    axis.text.y = element_text(size = 16),
    legend.position = "top",
    legend.justification = c(0, 1),
    legend.box.just = "left",
    legend.margin = margin(0, 0, 0, 0)
  )

ggsave("figure_d1.pdf", width = 13, height = 10, dpi = 600)

# Make dataset spatial

maverick_rep_media <- maverick_rep_media %>%
  filter(geo_precision > 2 & !is.na(latitude) & !is.na(longitude) & latitude != 0 & longitude != 0) %>% 
  mutate(year = year(as.Date(date_start, format = "%Y-%m-%d"))) %>% 
  mutate(country_year = paste(country, year, sep = "_")) %>% 
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326, remove = FALSE)

# Add district names from shapefiles

maverick_rep_media <- maverick_rep_media %>%
  st_join(shapefile_civ %>% select(ADM1_FR), left = TRUE) %>% 
  st_join(shapefile_ke %>% select(shapeName), left = TRUE) %>% 
  mutate(
    district = case_when(
      country == "Ivory Coast" ~ ADM1_FR,
      TRUE ~ shapeName
    )
  ) %>%
  filter(!is.na(district)) %>% 
  select(-ADM1_FR, -shapeName)

# Join dataset with PRIO-Grid dataset

maverick_rep_media <- maverick_rep_media %>% 
  st_join(shapefile_prio_grid)

# Create variables for analysis 

maverick_rep_media <- maverick_rep_media %>% 
  # Create security force involvement variable
  mutate(
    event_context = case_when(
      event_context == "Canvasing" ~ "Other context",
      event_context == "Incumbent campaign event" ~ "Campaign event",
      event_context == "Arrest or raid" ~ "Military operation",
      event_context == "Opposition campaign event" ~ "Campaign event",
      event_context == "Riot" ~ "Protest or riot",
      event_context == "Protest" ~ "Protest or riot",
      event_context == "Security force crack-down on protest" ~ "Protest or riot",
      event_context == "Vote counting" ~ "Voting procedures",
      event_context == "Voting" ~ "Voting procedures",
      event_context == "Indeterminate" ~ "Unknown",
      TRUE ~ event_context
    ),
    sf_involve = case_when(
      actor1_type == "Security forces" & actor1_bystander != 1 ~ 1,
      actor2_type == "Security forces" & actor2_bystander != 1 ~ 1,
      actor3_type == "Security forces" & actor3_bystander != 1 ~ 1,
      actor4_type == "Security forces" & actor4_bystander != 1 ~ 1,
      actor5_type == "Security forces" & actor5_bystander != 1 ~ 1,
      actor6_type == "Security forces" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    police_involve = case_when(
      actor1_subtype == "Security forces: Police" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Police" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Police" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Police" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Police" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Police" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    army_involve = case_when(
      actor1_subtype == "Security forces: Army" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Army" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Army" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Army" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Army" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Army" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    ),
    ppolice_involve = case_when(
      actor1_subtype == "Security forces: Paramilitary police" & actor1_bystander != 1 ~ 1,
      actor2_subtype == "Security forces: Paramilitary police" & actor2_bystander != 1 ~ 1,
      actor3_subtype == "Security forces: Paramilitary police" & actor3_bystander != 1 ~ 1,
      actor4_subtype == "Security forces: Paramilitary police" & actor4_bystander != 1 ~ 1,
      actor5_subtype == "Security forces: Paramilitary police" & actor5_bystander != 1 ~ 1,
      actor6_subtype == "Security forces: Paramilitary police" & actor6_bystander != 1 ~ 1,
      TRUE ~ 0
    )
  )

# Convert the variables to factors

maverick_rep_media$election <- as.factor(maverick_rep_media$election)

# Set the reference categories 

maverick_rep_media$election <- relevel(maverick_rep_media$election, ref = "CI-Presidential-1995")

# Remove events in elections with fewer than 10 violent events or for which the election is unknown

maverick_rep_media <- maverick_rep_media %>% 
  group_by(election) %>% 
  mutate(count = n()) %>% 
  ungroup() %>% 
  filter(count >= 10 & election != "Other" & election != "Indeterminate") 

##### Model 22 ####

model1 <- glm.nb(deaths_best ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_rep_media, link = "log")

# Calculate standard errors clustered by district

coeftest(model1, vcov = vcovCL(model1, cluster = ~ district))

##### Model 23####

model2 <- glm.nb(deaths_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_rep_media, link = "log")

# Calculate standard errors clustered by district

coeftest(model2, vcov = vcovCL(model2, cluster = ~ district))

##### Model 24 ####

model3 <- glm.nb(injuries_best ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_rep_media, link = "log")

# Calculate standard errors clustered by district

coeftest(model3, vcov = vcovCL(model3, cluster = ~ district))

##### Model 25 ####

model4 <- glm.nb(injuries_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_rep_media, link = "log")

# Calculate standard errors clustered by district

coeftest(model4, vcov = vcovCL(model4, cluster = ~ district))

##### Model 26 ####

model5 <- glm(damage ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_rep_media, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model5, vcov = vcovCL(model5, cluster = ~ district))

##### Model 27 ####

model6 <- glm(damage ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_rep_media, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model6, vcov = vcovCL(model6, cluster = ~ district))

# Create lists of clustered standard errors

cluster_se1 <- sqrt(diag(vcovCL(model1, type = "HC", cluster = ~ district)))
cluster_se2 <- sqrt(diag(vcovCL(model2, type = "HC", cluster = ~ district)))
cluster_se3 <- sqrt(diag(vcovCL(model3, type = "HC", cluster = ~ district)))
cluster_se4 <- sqrt(diag(vcovCL(model4, type = "HC", cluster = ~ district)))
cluster_se5 <- sqrt(diag(vcovCL(model5, type = "HC", cluster = ~ district)))
cluster_se6 <- sqrt(diag(vcovCL(model6, type = "HC", cluster = ~ district)))

# Format additional lines

additional_lines <- list(
  c("Election fixed effects", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Aggregation", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}")
)

##### Make regression table ####

stargazer(
  model1,
  model2,
  model3,
  model4,
  model5,
  model6,
  digits = 2,
  single.row = FALSE,
  se = list(cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6),
  dep.var.labels = c("Lethal violence", "Injurious violence", "Material destruction"),
  column.labels = c("(22)", "(23)", "(24)", "(25)", "(26)", "(27)"),
  model.names = FALSE,
  model.numbers = FALSE,
  covariate.labels = c(
    "Security force involvement",
    "Police involvement",
    "Paramilitary police involvement",
    "Army involvement",
    "Agricultural land cover",
    "Travel time to nearest urban center"
  ),
  omit = c("election", "Constant"),
  title = "Using only events sourced media reports",
  label = "tab:regression-media",
  table.placement = "t",
  add.lines = additional_lines,
  font.size = "scriptsize",
  notes.append = FALSE,
  star.char = c("+", "*", "**"),
  notes = c(
    "The models include only events aggregated based on media event reports.",
    "Robust standard errors in parentheses, clustered by district/county:", "+ p $<$ 0.1, * p $<$ 0.05, ** p $<$ 0.01"
  ),
  type = "text"
  # type = "latex",
  # out = "table_d4.tex"
)

##### Test that the difference in coefficients is statistically significant ####

vcov_cluster <- vcovCL(model2, cluster = ~district)

# Test if police_involve = ppolice_involve

linearHypothesis(model2, "police_involve = ppolice_involve", vcov. = vcov_cluster)

# Test if police_involve = army_involve

linearHypothesis(model2, "police_involve = army_involve", vcov. = vcov_cluster)

##### Clean working environment ####

rm(
  model1, model2, model3, model4, model5, model6,
  cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6, vcov_cluster,
  additional_lines, maverick_media, maverick_rep_media, maverick_reports, maverick_rep_reports, revised_aggregation
)

#### Accounting for event context ####

##### Model 28 ####

model1 <- glm.nb(deaths_best ~ sf_involve + agri_gc + ttime_mean + election + event_context, data = maverick_sf, link = "log")

# Calculate standard errors clustered by district

coeftest(model1, vcov = vcovCL(model1, cluster = ~ district))

##### Model 29 ####

model2 <- glm.nb(deaths_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election + event_context, data = maverick_sf, link = "log")

# Calculate standard errors clustered by district

coeftest(model2, vcov = vcovCL(model2, cluster = ~ district))

##### Model 30 ####

model3 <- glm.nb(injuries_best ~ sf_involve + agri_gc + ttime_mean + election + event_context, data = maverick_sf, link = "log")

# Calculate standard errors clustered by district

coeftest(model3, vcov = vcovCL(model3, cluster = ~ district))

##### Model 31 ####

model4 <- glm.nb(injuries_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election + event_context, data = maverick_sf, link = "log")

# Calculate standard errors clustered by district

coeftest(model4, vcov = vcovCL(model4, cluster = ~ district))

##### Model 32 ####

model5 <- glm(damage ~ sf_involve + agri_gc + ttime_mean + election + event_context, data = maverick_sf, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model5, vcov = vcovCL(model5, cluster = ~ district))

##### Model 33 ####

model6 <- glm(damage ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election + event_context, data = maverick_sf, family = "binomial")

# Calculate standard errors clustered by district

coeftest(model6, vcov = vcovCL(model6, cluster = ~ district))

# Create lists of clustered standard errors

cluster_se1 <- sqrt(diag(vcovCL(model1, type = "HC", cluster = ~ district)))
cluster_se2 <- sqrt(diag(vcovCL(model2, type = "HC", cluster = ~ district)))
cluster_se3 <- sqrt(diag(vcovCL(model3, type = "HC", cluster = ~ district)))
cluster_se4 <- sqrt(diag(vcovCL(model4, type = "HC", cluster = ~ district)))
cluster_se5 <- sqrt(diag(vcovCL(model5, type = "HC", cluster = ~ district)))
cluster_se6 <- sqrt(diag(vcovCL(model6, type = "HC", cluster = ~ district)))

# Format additional lines

additional_lines <- list(
  c("Election fixed effects", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Event context fixed effects", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Aggregation", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}")
)

##### Make regression table ####

stargazer(
  model1,
  model2,
  model3,
  model4,
  model5,
  model6,
  digits = 2,
  single.row = FALSE,
  se = list(cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6),
  dep.var.labels = c("Lethal violence", "Injurious violence", "Material destruction"),
  column.labels = c("(28)", "(29)", "(30)", "(31)", "(32)", "(33)"),
  model.names = FALSE,
  model.numbers = FALSE,
  covariate.labels = c(
    "Security force involvement",
    "Police involvement",
    "Paramilitary police involvement",
    "Army involvement",
    "Agricultural land cover",
    "Travel time to nearest urban center"
  ),
  omit = c("election", "event_context", "Constant"),
  title = "Accounting for event context",
  label = "tab:regression-context",
  table.placement = "t",
  add.lines = additional_lines,
  font.size = "scriptsize",
  notes.append = FALSE,
  star.char = c("+", "*", "**"),
  notes = c("Robust standard errors in parentheses, clustered by district/county:", "+ p $<$ 0.1, * p $<$ 0.05, ** p $<$ 0.01"),
  type = "text"
  # type = "latex",
  # out = "table_e1.tex"
)

##### Test that the difference in coefficients is statistically significant ####

vcov_cluster <- vcovCL(model2, cluster = ~district)

# Test if police_involve = ppolice_involve

linearHypothesis(model2, "police_involve = ppolice_involve", vcov. = vcov_cluster)

# Test if police_involve = army_involve

linearHypothesis(model2, "police_involve = army_involve", vcov. = vcov_cluster)

##### Clean working environment ####

rm(
  model1, model2, model3, model4, model5, model6,
  cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6, vcov_cluster,
  additional_lines
)

#### Using Conley standard errors ####

##### Model 34 ####

model1 <- fenegbin(deaths_best ~ sf_involve + agri_gc + ttime_mean | election, data = maverick_sf)

# Calculate Conley standard errors

conley_se1 <- vcov_conley(model1, lat = ~latitude, lon = ~longitude, cutoff = 50)

summary(model1, vcov = conley_se1)

##### Model 35 ####

model2 <- fenegbin(deaths_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean | election, data = maverick_sf)

# Calculate Conley standard errors

conley_se2 <- vcov_conley(model2, lat = ~latitude, lon = ~longitude, cutoff = 50)

summary(model2, vcov = conley_se2)

##### Model 36 ####

model3 <- fenegbin(injuries_best ~ sf_involve + agri_gc + ttime_mean | election, data = maverick_sf)

# Calculate Conley standard errors

conley_se3 <- vcov_conley(model3, lat = ~latitude, lon = ~longitude, cutoff = 50)

summary(model3, vcov = conley_se3)

##### Model 37 ####

model4 <- fenegbin(injuries_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean | election, data = maverick_sf)

# Calculate Conley standard errors

conley_se4 <- vcov_conley(model4, lat = ~latitude, lon = ~longitude, cutoff = 50)

summary(model4, vcov = conley_se4)

##### Model 38 ####

model5 <- feglm(damage ~ sf_involve + agri_gc + ttime_mean | election, data = maverick_sf, family = "binomial")

# Calculate Conley standard errors

conley_se5 <- vcov_conley(model5, lat = ~latitude, lon = ~longitude, cutoff = 50)

summary(model5, vcov = conley_se5)

##### Model 39 ####

model6 <- feglm(damage ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean | election, data = maverick_sf, family = "binomial")

# Calculate Conley standard errors

conley_se6 <- vcov_conley(model6, lat = ~latitude, lon = ~longitude, cutoff = 50)

summary(model6, vcov = conley_se6)

# Rename co-variates 

dict <- c(
  sf_involve = "Security force involvement",
  police_involve = "Police involvement",
  ppolice_involve = "Paramililitary police involvement",
  army_involve = "Army involvement",
  agri_gc = "Agricultural land cover",
  ttime_mean = "Travel time to nearest urban center",
  deaths_best = "Lethal violence",
  injuries_best = "Injurious violence",
  damage = "Material destruction",
  election = "Election"
)

# Collect the Conley standard errors 

vcov_list <- list(
  vcov_conley(model1, lat = ~latitude, lon = ~longitude, cutoff = 100),
  vcov_conley(model2, lat = ~latitude, lon = ~longitude, cutoff = 100),
  vcov_conley(model3, lat = ~latitude, lon = ~longitude, cutoff = 100),
  vcov_conley(model4, lat = ~latitude, lon = ~longitude, cutoff = 100),
  vcov_conley(model5, lat = ~latitude, lon = ~longitude, cutoff = 100),
  vcov_conley(model6, lat = ~latitude, lon = ~longitude, cutoff = 100)
)

##### Make regression table ####

unlink("table_e2.tex")

table_tex <- etable(
  model1, model2, model3, model4, model5, model6,
  headers = list("^Model" = c("(34)", "(35)", "(36)", "(37)", "(38)", "(39)")),
  dict = dict,
  vcov = vcov_list,
  order = c(
    "Security force involvement",
    "Police involvement",
    "Paramililitary police involvement",
    "Army involvement",
    "Agricultural land cover",
    "Travel time to nearest urban center"
  ),
  digits = "r2",
  digits.stats = "r2",
  signif.code = c("**" = 0.01, "*" = 0.05, "+" = 0.10),
  fitstat = c("n", "r2", "aic"),
  placement = "t",
  family = FALSE,
  title = "Using Conley standard errors",
  label = "tab:regression-conley",
  fontsize = "scriptsize",
  se.below = TRUE,
  tex = TRUE
)

table_tex <- table_tex[!grepl("^\\s*Model:.*\\\\\\\\", table_tex)]

writeLines(table_tex, "table_e2.tex")

##### Clean working environment ####

rm(
  model1, model2, model3, model4, model5, model6,
  dict, vcov_list,
  conley_se1, conley_se2, conley_se3, conley_se4, conley_se5, conley_se6, table_tex
)

#### Using Poisson regression ####

##### Model 40 ####

model1 <- glm(deaths_best ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_sf, family = poisson(link = "log"))

# Calculate standard errors clustered by district

coeftest(model1, vcov = vcovCL(model1, cluster = ~ district))

##### Model 41 ####

model2 <- glm(deaths_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf, family = poisson(link = "log"))

# Calculate standard errors clustered by district

coeftest(model2, vcov = vcovCL(model2, cluster = ~ district))

##### Model 42 ####

model3 <- glm(injuries_best ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_sf, family = poisson(link = "log"))

# Calculate standard errors clustered by district

coeftest(model3, vcov = vcovCL(model3, cluster = ~ district))

##### Model 43 ####

model4 <- glm(injuries_best ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf, family = poisson(link = "log"))

# Calculate standard errors clustered by district

coeftest(model4, vcov = vcovCL(model4, cluster = ~ district))

# Create lists of clustered standard errors

cluster_se1 <- sqrt(diag(vcovCL(model1, type = "HC", cluster = ~ district)))
cluster_se2 <- sqrt(diag(vcovCL(model2, type = "HC", cluster = ~ district)))
cluster_se3 <- sqrt(diag(vcovCL(model3, type = "HC", cluster = ~ district)))
cluster_se4 <- sqrt(diag(vcovCL(model4, type = "HC", cluster = ~ district)))

# Format additional lines

additional_lines <- list(
  c("Election fixed effects", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Aggregation", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}")
)

##### Make regression table ####

stargazer(
  model1,
  model2,
  model3,
  model4,
  digits = 2,
  single.row = FALSE,
  se = list(cluster_se1, cluster_se2, cluster_se3, cluster_se4),
  dep.var.labels = c("Lethal violence", "Injurious violence"),
  column.labels = c("(40)", "(41)", "(42)", "(43)"),
  model.names = FALSE,
  model.numbers = FALSE,
  covariate.labels = c(
    "Security force involvement",
    "Police involvement",
    "Paramilitary police involvement",
    "Army involvement",
    "Agricultural land cover",
    "Travel time to nearest urban center"
  ),
  omit = c("election", "Constant"),
  title = "Using Poisson regression",
  label = "tab:regression-poisson",
  table.placement = "t",
  add.lines = additional_lines,
  font.size = "scriptsize",
  notes.append = FALSE,
  star.char = c("+", "*", "**"),
  notes = c("Robust standard errors in parentheses, clustered by district/county:", "+ p $<$ 0.1, * p $<$ 0.05, ** p $<$ 0.01"),
  type = "text"
  # type = "latex",
  # out = "table_e3.tex"
)

##### Test that the difference in coefficients is statistically significant ####

vcov_cluster <- vcovCL(model2, cluster = ~district)

# Test if police_involve = ppolice_involve

linearHypothesis(model2, "police_involve = ppolice_involve", vcov. = vcov_cluster)

# Test if police_involve = army_involve

linearHypothesis(model2, "police_involve = army_involve", vcov. = vcov_cluster)

##### Clean working environment ####

rm(
  model1, model2, model3, model4,
  cluster_se1, cluster_se2, cluster_se3, cluster_se4, vcov_cluster,
  additional_lines
)

#### Using Linear Probability regression and binary outcome variables ####

# Create binary outcome variables

maverick_sf <- maverick_sf %>% 
  mutate(
    deaths_bin = case_when(deaths_best > 0 ~ 1, TRUE ~ 0),
    injuries_bin = case_when(injuries_best > 0 ~ 1, TRUE ~ 0)
  )

##### Model 44 ####

model1 <- lm(deaths_bin ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_sf)

# Calculate standard errors clustered by district

coeftest(model1, vcov = vcovCL(model1, cluster = ~district))

##### Model 45 ####

model2 <- lm(deaths_bin ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf)

# Calculate standard errors clustered by district

coeftest(model2, vcov = vcovCL(model2, cluster = ~district))

##### Model 46 ####

model3 <- lm(injuries_bin ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_sf)

# Calculate standard errors clustered by district

coeftest(model3, vcov = vcovCL(model3, cluster = ~district))

##### Model 47 ####

model4 <- lm(injuries_bin ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf)

# Calculate standard errors clustered by district

coeftest(model4, vcov = vcovCL(model4, cluster = ~district))

##### Model 48 ####

model5 <- lm(damage ~ sf_involve + agri_gc + ttime_mean + election, data = maverick_sf)

# Calculate standard errors clustered by district

coeftest(model5, vcov = vcovCL(model5, cluster = ~district))

##### Model 49 ####

model6 <- lm(damage ~ police_involve + ppolice_involve + army_involve + agri_gc + ttime_mean + election, data = maverick_sf)

# Calculate standard errors clustered by district

coeftest(model6, vcov = vcovCL(model6, cluster = ~district))

# Create lists of clustered standard errors

cluster_se1 <- sqrt(diag(vcovCL(model1, type = "HC", cluster = ~ district)))
cluster_se2 <- sqrt(diag(vcovCL(model2, type = "HC", cluster = ~ district)))
cluster_se3 <- sqrt(diag(vcovCL(model3, type = "HC", cluster = ~ district)))
cluster_se4 <- sqrt(diag(vcovCL(model4, type = "HC", cluster = ~ district)))
cluster_se5 <- sqrt(diag(vcovCL(model5, type = "HC", cluster = ~ district)))
cluster_se6 <- sqrt(diag(vcovCL(model6, type = "HC", cluster = ~ district)))

# Format additional lines

additional_lines <- list(
  c("Election fixed effects", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}", "\\text{Yes}"),
  c("Aggregation", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}", "\\text{Most-rep.}")
)

##### Make regression table ####

stargazer(
  model1,
  model2,
  model3,
  model4,
  model5,
  model6,
  digits = 2,
  single.row = FALSE,
  se = list(cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6),
  dep.var.labels = c("Lethal violence", "Injurious violence", "Material destruction"),
  column.labels = c("(44)", "(45)", "(46)", "(47)", "(48)", "(49)"),
  model.names = FALSE,
  model.numbers = FALSE,
  covariate.labels = c(
    "Security force involvement",
    "Police involvement",
    "Paramilitary police involvement",
    "Army involvement",
    "Agricultural land cover",
    "Travel time to nearest urban center"
  ),
  omit = c("election", "Constant"),
  title = "Using a Linear Probability Model and binary outcome variables",
  label = "tab:regression-lpm",
  table.placement = "t",
  add.lines = additional_lines,
  font.size = "scriptsize",
  notes.append = FALSE,
  star.char = c("+", "*", "**"),
  notes = c("Robust standard errors in parentheses, clustered by district/county:", "+ p $<$ 0.1, * p $<$ 0.05, ** p $<$ 0.01"),
  type = "text"
  # type = "latex",
  # out = "table_e4.tex"
)

##### Test that the difference in coefficients is statistically significant ####

vcov_cluster <- vcovCL(model2, cluster = ~district)

# Test if police_involve = ppolice_involve

linearHypothesis(model2, "police_involve = ppolice_involve", vcov. = vcov_cluster)

# Test if police_involve = army_involve

linearHypothesis(model2, "police_involve = army_involve", vcov. = vcov_cluster)

##### Clean working environment ####

rm(
  model1, model2, model3, model4, model5, model6,
  cluster_se1, cluster_se2, cluster_se3, cluster_se4, cluster_se5, cluster_se6, vcov_cluster,
  additional_lines
)
